Import data with pandas using read_table



In [129]:

    
import pandas as pd

# The raw file is tab-separated and has no header row, so the two column
# names are supplied explicitly.
# NOTE(review): pandas' default quote handling is in effect here; if any
# message contains an unbalanced double quote, neighbouring rows could be
# merged. Confirm the loaded row count against the dataset's documented size.
df = pd.read_table(
    'smsspamcollection/SMSSpamCollection',
    sep='\t',
    header=None,
    names=['label', 'sms_message'],
)
df.head()









    Out[129]:







  
    
      
      label
      sms_message
    
  
  
    
      0
      ham
      Go until jurong point, crazy.. Available only ...
    
    
      1
      ham
      Ok lar... Joking wif u oni...
    
    
      2
      spam
      Free entry in 2 a wkly comp to win FA Cup fina...
    
    
      3
      ham
      U dun say so early hor... U c already then say...
    
    
      4
      ham
      Nah I don't think he goes to usf, he lives aro...

Clean the data: encode the labels as integers (ham: 0, spam: 1)



In [130]:

    
# Encode the text labels as integers: ham -> 0, spam -> 1.
# Series.map turns every value that is missing from the mapping into NaN, so
# with only the 'ham'/'spam' keys a second run of this cell would wipe out the
# already-encoded column. The identity entries (0 -> 0, 1 -> 1) make the cell
# safe to re-run.
df['label'] = df['label'].map({
    'ham': 0,
    'spam': 1,
    0: 0,
    1: 1,
})
df.head()









    Out[130]:







  
    
      
      label
      sms_message
    
  
  
    
      0
      0
      Go until jurong point, crazy.. Available only ...
    
    
      1
      0
      Ok lar... Joking wif u oni...
    
    
      2
      1
      Free entry in 2 a wkly comp to win FA Cup fina...
    
    
      3
      0
      U dun say so early hor... U c already then say...
    
    
      4
      0
      Nah I don't think he goes to usf, he lives aro...

Implement bag of words



In [131]:

    
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer created with its default settings. It is only
# instantiated here; it is fitted on the training messages in a later cell.
count_vector = CountVectorizer()



In [132]:

    
from sklearn.model_selection import train_test_split
# (older scikit-learn releases exposed this helper as
#  sklearn.cross_validation.train_test_split)

# Hold out part of the messages for evaluation. No test_size is given, so the
# library default applies; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['sms_message'], df['label'], random_state=1
)

# Report how many messages ended up in each part of the split.
print('Number of rows in the total set: {}'.format(len(df)))
print('Number of rows in the training set: {}'.format(len(X_train)))
print('Number of rows in the test set: {}'.format(len(X_test)))









    



Number of rows in the total set: 5572
Number of rows in the training set: 4179
Number of rows in the test set: 1393



In [133]:

    
# Fit the vectorizer on the training messages only and convert them to a
# document-term count matrix in the same call.
training_data = count_vector.fit_transform(X_train)

# Convert the test messages with the already-fitted vectorizer (transform
# only, no refit), so the test split does not influence the learned vocabulary.
testing_data = count_vector.transform(X_test)



In [134]:

    
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes over the word-count features. alpha=1.0 is the
# library default (it matches the estimator repr shown in this cell's output)
# and is written out only to make the smoothing setting visible.
naive_bayes = MultinomialNB(alpha=1.0)

# Train on the vectorized training messages; as the last expression, the
# fitted estimator is also what the cell displays.
naive_bayes.fit(training_data, y_train)









    Out[134]:





MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)



In [140]:

    
# Ask for one message and wrap it in a list: the vectorizer expects a
# collection of documents rather than a bare string.
inputvar = input('Please enter the sms message to classify it as Spam or Not!')
test_sample = [inputvar]
print(test_sample)

# Vectorize with the vocabulary fitted earlier, then classify the message.
doc_sample = count_vector.transform(test_sample)
predictions = naive_bayes.predict(doc_sample)
print(predictions)

# Label 0 was assigned to 'ham' and 1 to 'spam' when the labels were encoded.
status = predictions[0]
print("Not spam" if status == 0 else "spam")

# Predictions for the whole test split. They are unrelated to the message
# typed above, but the metrics cell further down reads `prediction`.
prediction = naive_bayes.predict(testing_data)









    



Please enter the sms message to classify it as Spam or Not!get free sms data
['get free sms data']
[1]
spam



In [139]:

    
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Score the held-out split inside this cell. Previously `prediction` existed
# only as a side effect of the interactive input() cell, so these metrics
# depended on that cell having been run first. Recomputing it here makes the
# cell depend only on the fitted model and the vectorized test messages.
prediction = naive_bayes.predict(testing_data)

# Precision, recall and F1 use scikit-learn's default positive class, label 1,
# which is 'spam' under the ham -> 0 / spam -> 1 encoding.
# format(x) with no format spec is the built-in that returns the value as text;
# it is kept so the printed output stays exactly as recorded.
print('Accuracy score: ', format(accuracy_score(y_test, prediction)))
print('Precision score: ', format(precision_score(y_test, prediction)))
print('Recall score: ', format(recall_score(y_test, prediction)))
print('F1 score: ', format(f1_score(y_test, prediction)))









    



Accuracy score:  0.9885139985642498
Precision score:  0.9720670391061452
Recall score:  0.9405405405405406
F1 score:  0.9560439560439562

	label	sms_message
0	ham	Go until jurong point, crazy.. Available only ...
1	ham	Ok lar... Joking wif u oni...
2	spam	Free entry in 2 a wkly comp to win FA Cup fina...
3	ham	U dun say so early hor... U c already then say...
4	ham	Nah I don't think he goes to usf, he lives aro...